Rishav Eliyahu Sofer
8/15/2024
The data I have selected is from a longitudinal study that collected data on cancer deaths across various countries and cancer types between 1990 and 2016.
Today I will be showing basic analysis of this cancer mortality data to see if any significant trends can be identified.
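Note: Deaths are reported as estimated counts, not in thousands (the yearly totals computed below range from roughly 7 to 10.5 million).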
The data was collected from Kaggle and is part of the dataset titled ‘Cancer Deaths by Country and Type (1990-2016)’, which provides comprehensive information on cancer mortality trends across different countries and cancer types over a 26-year period.
# Importing important libraries (not all may be used, but these are generally important for this kind of data analysis)
library(tidyverse) ## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(stats)
# Load the dataset directly from its raw GitHub URL.
cancer_url <- "https://raw.githubusercontent.com/rishavsofer/NYU_R_Project/main/Cancer%20Deaths%20by%20Country%20and%20Type%20Dataset.csv"
cancer <- read.csv(file = cancer_url)

# Peek at the first six rows to confirm the import looks right.
head(cancer, n = 6L)
## Country Code Year Liver.cancer Kidney.cancer Larynx.cancer Breast.cancer
## 1 Afghanistan AFG 1990 243.6637 39.47049 109.3342 766.5354
## 2 Afghanistan AFG 1991 261.2418 41.37602 117.3117 823.2339
## 3 Afghanistan AFG 1992 284.4436 44.10631 128.0716 901.0221
## 4 Afghanistan AFG 1993 313.1368 47.42485 141.4296 996.4328
## 5 Afghanistan AFG 1994 343.2297 50.71095 155.7546 1097.8952
## 6 Afghanistan AFG 1995 371.9291 53.81093 168.4818 1186.0278
## Thyroid.cancer Stomach.cancer Bladder.cancer Uterine.cancer Ovarian.cancer
## 1 79.82017 923.4952 148.1392 108.1194 68.99470
## 2 85.11102 989.7096 156.9774 115.0684 73.51170
## 3 92.24060 1078.4590 168.9905 124.5071 79.58896
## 4 101.20673 1192.0645 184.3477 136.4029 86.60245
## 5 110.67992 1316.5057 200.2469 149.3039 93.47354
## 6 119.03708 1425.6272 214.5046 160.8058 99.79714
## Cervical.cancer Prostate.cancer Pancreatic.cancer Esophageal.cancer
## 1 317.0996 130.3028 125.9848 189.3686
## 2 336.3375 139.5326 133.8216 200.2654
## 3 362.1939 151.5814 144.4897 215.1851
## 4 394.2766 167.7904 157.3959 234.1602
## 5 428.1947 184.7531 170.3166 254.3053
## 6 457.1709 200.9940 182.4932 271.9487
## Testicular.cancer Nasopharynx.cancer Other.pharynx.cancer
## 1 27.49423 73.29566 40.96262
## 2 29.03400 78.31531 43.15965
## 3 31.24132 85.00778 46.18636
## 4 33.99369 92.96377 49.87100
## 5 36.90399 100.94951 53.62337
## 6 39.27382 107.83341 56.79263
## Colon.and.rectum.cancer Non.melanoma.skin.cancer Lip.and.oral.cavity.cancer
## 1 442.1571 26.44616 53.59964
## 2 476.4320 28.27527 57.14889
## 3 521.9227 30.71815 61.87610
## 4 577.3300 33.83544 67.50486
## 5 635.4031 37.10337 73.17588
## 6 688.4817 40.04284 78.31936
## Brain.and.nervous.system.cancer Tracheal..bronchus..and.lung.cancer.
## 1 163.8691 797.2657
## 2 174.1832 853.1264
## 3 188.3823 927.8128
## 4 205.2504 1017.9647
## 5 222.3836 1110.9972
## 6 239.0922 1195.7509
## Gallbladder.and.biliary.tract.cancer Malignant.skin.melanoma Leukemia
## 1 125.9362 14.29398 727.7634
## 2 133.7814 15.24105 766.0402
## 3 144.2876 16.50883 820.9565
## 4 157.4710 18.03851 891.1342
## 5 171.3113 19.60664 965.2861
## 6 183.7626 21.04178 1033.6420
## Hodgkin.lymphoma Multiple.myeloma Other.cancers
## 1 191.3674 50.71944 294.8397
## 2 203.5096 54.31764 311.4691
## 3 220.2080 59.14420 334.5660
## 4 240.7188 64.93858 362.4867
## 5 262.3241 70.93035 390.3347
## 6 280.9813 76.48989 416.2341
The data required re-naming columns for consistency and re-organization into long format for ease of analysis.
# Renaming columns for consistency (Title_Snake_Case for every cancer type).
# Country, Code and Year already carry their final names, so they are not
# listed here.
cancer_renamed <- cancer %>%
  rename(
    Liver_Cancer = "Liver.cancer",
    Kidney_Cancer = "Kidney.cancer",
    Larynx_Cancer = "Larynx.cancer",
    Breast_Cancer = "Breast.cancer",
    Thyroid_Cancer = "Thyroid.cancer",
    Stomach_Cancer = "Stomach.cancer",
    Bladder_Cancer = "Bladder.cancer",
    Uterine_Cancer = "Uterine.cancer",
    Ovarian_Cancer = "Ovarian.cancer",
    Cervical_Cancer = "Cervical.cancer",
    Prostate_Cancer = "Prostate.cancer",
    Pancreatic_Cancer = "Pancreatic.cancer",
    Esophageal_Cancer = "Esophageal.cancer",
    Testicular_Cancer = "Testicular.cancer",
    Nasopharynx_Cancer = "Nasopharynx.cancer",
    # Spelling fixed (was "Other_Phariynx_Cancer"); this name becomes a
    # Cancer_Type value and therefore shows up in plot legends.
    Other_Pharynx_Cancer = "Other.pharynx.cancer",
    Colon_Rectum_Cancer = "Colon.and.rectum.cancer",
    Non_Melanoma_Skin_Cancer = "Non.melanoma.skin.cancer",
    Lip_Oral_Cavity_Cancer = "Lip.and.oral.cavity.cancer",
    Brain_Nervous_System_Cancer = "Brain.and.nervous.system.cancer",
    Tracheal_Bronchus_Lung_Cancer = "Tracheal..bronchus..and.lung.cancer.",
    Gallbladder_Biliary_Tract_Cancer = "Gallbladder.and.biliary.tract.cancer",
    Malignant_Skin_Melanoma = "Malignant.skin.melanoma",
    Hodgkin_Lymphoma = "Hodgkin.lymphoma",
    Multiple_Myeloma = "Multiple.myeloma",
    Other_Cancers = "Other.cancers"
  )
# Previewing the renamed columns
colnames(cancer_renamed)
## [1] "Country" "Code"
## [3] "Year" "Liver_Cancer"
## [5] "Kidney_Cancer" "Larynx_Cancer"
## [7] "Breast_Cancer" "Thyroid_Cancer"
## [9] "Stomach_Cancer" "Bladder_Cancer"
## [11] "Uterine_Cancer" "Ovarian_Cancer"
## [13] "Cervical_Cancer" "Prostate_Cancer"
## [15] "Pancreatic_Cancer" "Esophageal_Cancer"
## [17] "Testicular_Cancer" "Nasopharynx_Cancer"
## [19] "Other_Pharynx_Cancer" "Colon_Rectum_Cancer"
## [21] "Non_Melanoma_Skin_Cancer" "Lip_Oral_Cavity_Cancer"
## [23] "Brain_Nervous_System_Cancer" "Tracheal_Bronchus_Lung_Cancer"
## [25] "Gallbladder_Biliary_Tract_Cancer" "Malignant_Skin_Melanoma"
## [27] "Leukemia" "Hodgkin_Lymphoma"
## [29] "Multiple_Myeloma" "Other_Cancers"
# Reshaping the data to long format: one row per Country / Year / Cancer_Type.
# Every column except the three identifiers is a cancer type, so the
# identifiers are excluded instead of listing all 27 type columns by hand
# (which would have to be kept in sync with the rename above).
cancer_long <- cancer_renamed %>%
  pivot_longer(
    cols = !c(Country, Code, Year),
    names_to = "Cancer_Type",
    values_to = "Deaths"
  )
# Previewing the long format data
head(cancer_long)
## # A tibble: 6 × 5
## Country Code Year Cancer_Type Deaths
## <chr> <chr> <int> <chr> <dbl>
## 1 Afghanistan AFG 1990 Liver_Cancer 244.
## 2 Afghanistan AFG 1990 Kidney_Cancer 39.5
## 3 Afghanistan AFG 1990 Larynx_Cancer 109.
## 4 Afghanistan AFG 1990 Breast_Cancer 767.
## 5 Afghanistan AFG 1990 Thyroid_Cancer 79.8
## 6 Afghanistan AFG 1990 Stomach_Cancer 923.
Next, we check whether the data contains any NAs so that they can be dealt with before the analysis.
# Tally the missing values column by column (one row, one count per column).
na_count <- summarise(
  cancer_long,
  across(everything(), function(column) sum(is.na(column)))
)

# Show the per-column NA counts.
na_count
## # A tibble: 1 × 5
## Country Code Year Cancer_Type Deaths
## <int> <int> <int> <int> <int>
## 1 0 756 0 0 0
NAs were found only in the ‘Code’ column, which holds the code for each country name, so we take a closer look at those rows.
# Keep only the rows whose 'Code' value is missing.
na_codes <- filter(cancer_long, is.na(Code))

# Look at the first six of those rows.
head(na_codes, n = 6L)
## # A tibble: 6 × 5
## Country Code Year Cancer_Type Deaths
## <chr> <chr> <int> <chr> <dbl>
## 1 Caribbean <NA> 1990 Liver_Cancer 1644.
## 2 Caribbean <NA> 1990 Kidney_Cancer 537.
## 3 Caribbean <NA> 1990 Larynx_Cancer 758.
## 4 Caribbean <NA> 1990 Breast_Cancer 2515.
## 5 Caribbean <NA> 1990 Thyroid_Cancer 127.
## 6 Caribbean <NA> 1990 Stomach_Cancer 2897.
## Country Code Year Cancer_Type
## Length:756 Length:756 Min. :1990 Length:756
## Class :character Class :character 1st Qu.:1996 Class :character
## Mode :character Mode :character Median :2002 Mode :character
## Mean :2003
## 3rd Qu.:2009
## Max. :2016
## Deaths
## Min. : 37.92
## 1st Qu.: 4863.98
## Median : 12739.30
## Mean : 22361.27
## 3rd Qu.: 21418.05
## Max. :213062.16
Based on the output of head(na_codes), we check which Country names are associated with the NA codes.
# Collect the distinct Country values among the rows with a missing 'Code',
# in order of first appearance.
countries_with_na_code <- unique(na_codes[["Country"]])

# Display them.
print(countries_with_na_code)
## [1] "Caribbean" "North America"
Decision: Omit NAs as well as other regional categories and retain only country-specific data
# Regional / aggregate entries that should not be treated as countries.
regions <- c(
  "World", "East Asia", "Western Europe", "Africa", "Eastern Europe",
  "Southeast Asia", "South Asia", "Latin America and Caribbean",
  "Sub-Saharan Africa"
)

# Drop every row that contains an NA, then remove the listed regions so that
# only the remaining Country entries are kept.
cancer.clean <- cancer_long %>%
  na.omit() %>%
  filter(!(Country %in% regions))
# Calculating total deaths per year
# One row per Year: Deaths summed over every row of cancer.clean for that year.
total_deaths <- cancer.clean %>%
  group_by(Year) %>%
  summarize(Total_Deaths = sum(Deaths))
# Linear model: yearly total deaths regressed on calendar year
model <- lm(Total_Deaths ~ Year, data = total_deaths)
# Creating the plot with linear fit
ggplot(total_deaths, aes(x = Year, y = Total_Deaths)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Global Cancer Deaths Over Time with Linear Fit",
    x = "Year",
    y = "Total Deaths"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Inspecting the fitted trend (coefficients, R-squared); without this call
# `model` is fitted but never used.
summary(model)
##
## Call:
## lm(formula = Total_Deaths ~ Year, data = total_deaths)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96175 -63062 -1096 37965 178059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -276876408 3383910 -81.82 <2e-16 ***
## Year 142565 1689 84.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68370 on 25 degrees of freedom
## Multiple R-squared: 0.9965, Adjusted R-squared: 0.9964
## F-statistic: 7121 on 1 and 25 DF, p-value: < 2.2e-16
# To analyze cancer type-specific trends, we'll need to calculate the proportion of deaths for each cancer type per year
cancer_type_proportions <- cancer.clean %>%
  group_by(Year, Cancer_Type) %>%
  # Drop the grouping explicitly instead of relying on summarise()'s implicit
  # "drop last group" default, which emits a message on every run.
  summarize(Total_Deaths = sum(Deaths), .groups = "drop") %>%
  # Within each year, divide each type's total by that year's grand total.
  group_by(Year) %>%
  mutate(Proportion = Total_Deaths / sum(Total_Deaths)) %>%
  # Return a plain, ungrouped tibble so later steps are not silently grouped.
  ungroup()
# A stacked area chart is suitable for visualizing proportions over time:
ggplot(cancer_type_proportions, aes(x = Year, y = Proportion, fill = Cancer_Type)) +
  geom_area() +
  labs(
    title = "Proportion of Cancer Deaths by Type Over Time",
    x = "Year",
    y = "Proportion of Deaths"
  )
Closer Look: Look at only the top 5 cancer types
# Filtering by top cancer types: the five types with the largest summed
# Deaths over all rows of cancer.clean.
top_cancer_types <- cancer.clean %>%
  group_by(Cancer_Type) %>%
  summarize(Total_Deaths = sum(Deaths)) %>%
  arrange(desc(Total_Deaths)) %>%
  head(n = 5)
# Yearly totals for those five types only. Note that Proportion is each type's
# share of the top-five total for the year, not of all cancer deaths.
top_cancer_data <- cancer.clean %>%
  filter(Cancer_Type %in% top_cancer_types$Cancer_Type) %>%
  group_by(Year, Cancer_Type) %>%
  # Explicit .groups avoids summarise()'s regrouping message.
  summarize(Total_Deaths = sum(Deaths), .groups = "drop") %>%
  group_by(Year) %>%
  mutate(Proportion = Total_Deaths / sum(Total_Deaths)) %>%
  # Hand back an ungrouped tibble.
  ungroup()
ggplot(top_cancer_data, aes(x = Year, y = Proportion, fill = Cancer_Type)) +
  geom_area() +
  labs(
    title = "Proportion of Deaths for Top Cancer Types Over Time",
    x = "Year",
    y = "Proportion of Deaths"
  )
# Calculating total deaths per country
# One row per Country: Deaths summed over all years and cancer types.
# NOTE: this overwrites the per-year `total_deaths` table built earlier.
total_deaths <- cancer.clean %>%
  group_by(Country) %>%
  summarize(Total_Deaths = sum(Deaths))
# Ranking countries by total deaths
ranked_countries <- total_deaths %>%
  arrange(desc(Total_Deaths))
# Getting the top 5 countries
top_5 <- head(ranked_countries, 5)
# Creating a ggplot object: horizontal bars ordered by total deaths
p <- ggplot(
  top_5,
  aes(x = reorder(Country, Total_Deaths), y = Total_Deaths, fill = Country)
) +
  # geom_col() draws bar heights straight from the y values.
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 5 Countries by Total Cancer Deaths",
    x = "Country",
    y = "Total Deaths"
  ) +
  theme(legend.position = "none")
# Converting to interactive plot. The call is namespace-qualified because no
# library(plotly) call is visible in this file.
plotly::ggplotly(p)
## # A tibble: 5 × 2
## Country Total_Deaths
## <chr> <dbl>
## 1 China 53065391.
## 2 United States 15089092.
## 3 India 14805223.
## 4 Japan 8268955.
## 5 Russia 8069324.
Closer Look: Look at the order of magnitude difference between the top 5 countries by total deaths
# Add the order of magnitude (integer part of log10) of each country's total.
top_5 <- mutate(top_5, Magnitude = floor(log10(Total_Deaths)))

# Bar chart of the order of magnitude per country.
ggplot(top_5, aes(x = Country, y = Magnitude)) +
  geom_col() +
  labs(
    title = "Order of Magnitude of Total Deaths by Country",
    x = "Country",
    y = "Order of Magnitude"
  )
## # A tibble: 5 × 3
## Country Total_Deaths Magnitude
## <chr> <dbl> <dbl>
## 1 China 53065391. 7
## 2 United States 15089092. 7
## 3 India 14805223. 7
## 4 Japan 8268955. 6
## 5 Russia 8069324. 6
Discussion
Future Directions
The results of this project point to many future directions in terms of research, including: